# Loading email data set: consists of "text data" and "dummy variable"
email_df <- data.table::fread( "spam_or_not_spam.csv", header = T )
# Randomizing data
set.seed(46234)
email_df <- email_df[ sample( 1:nrow( email_df ) ), ]
# Factoring dummy variable
email_df$label <- factor( email_df$label )library(pacman)
p_load(fastverse, magrittr, here, skimr, dplyr,
ggplot2, ggthemes, equatiomatic, gridExtra,
caret, naivebayes, knitr, kableExtra, shiny,
data.table, tidymodels, ggthemes, wordcloud,
tm, SnowballC, RColorBrewer, e1071, data.table,
stringr, disk.frame, knitr, ranger, DT
)## Installing package into '/Users/zoearnaut-hull/Library/R/3.6/library'
## (as 'lib' is unspecified)
## Warning: package 'disk.frame' is not available (for R version 3.6.1)
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'disk.frame'
## Warning in p_load(fastverse, magrittr, here, skimr, dplyr, ggplot2, ggthemes, : Failed to install/load:
## disk.frame
To retrieve the root of a word (eg, doing -> do), options are āstemmingā &ālemmatizationā.
STEMMING: faster but maybe not as effective
LEMMATIZATION: slower but more effective
More on this: Here
VectorSource(): creates one document for each email
Vcorpus(): creates a volatile corpus from these individual emails
email_corpus <- VCorpus(
VectorSource(
email_df$email
)
)
# Using `tm` package to stem email content
email_corpus <- tm::tm_map( email_corpus,
tm::stemDocument )
# Removing puctuations
email_corpus = tm_map( email_corpus,
removePunctuation )
# Removing stopwords
email_corpus <- tm_map( email_corpus,
removeWords, stopwords( "en" ) )
# OPTIONAL -- Removing two most frequent stopwords: "NUMBER", "URL"
email_corpus <- tm_map( email_corpus,
removeWords, c("NUMBER", "number", "url", "URL") )
# Testing keeping URL
# email_corpus <- tm_map( email_corpus,
# removeWords, c("NUMBER", "number") )
# OPTIONAL -- Removing extra white space
# email_corpus <- tm_map( email_corpus,
# stringr::str_squish)
# DocumentTermMatrix(): tokenize the email corpus.
email_dtm <- tm::DocumentTermMatrix( sample( email_corpus, length( email_corpus ) ) )Visualing text data after cleaning and pre-processing
wordcloud( reverse_email$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE ) # Subsetting to spam == 1
spam <- reverse_email %>% filter( type == 1 )
wordcloud( spam$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE,
main = "Spam")# Subsetting to spam == 0
ham <- reverse_email %>% filter( type == 0 )
wordcloud( ham$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE,
main = "Non-Spam") prop_table = data.frame(c(prop.table( table( email_train_label ) )[[2]], #Train
prop.table( table( email_train_label ) )[[1]]),
c(prop.table( table( email_test_label ) )[[2]], # Test
prop.table( table( email_test_label ) )[[1]])
)
rownames(prop_table) = c("Spam", "Non-Spam")
names(prop_table) = c("Train", "Test")
kable(prop_table, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Train | Test | |
|---|---|---|
| Spam | 0.17 | 0.152 |
| Non-Spam | 0.83 | 0.848 |
We currently have 25,050 variables (too many!)
Defining threshold (eg. 1 == 1%) Goal: Eliminate words that appear in __% of records in the training data
# I know there's probably a quicker way todo this but hey it was only 14 lines
freq_words_plus_1 = c(freq_words, "")
freq_words_dt = data.frame(
c(freq_words_plus_1[seq( 1+0*1,16*1)]),
c(freq_words_plus_1[seq(1*16+1,16*2)]),
c(freq_words_plus_1[seq(2*16+1,16*3)]),
c(freq_words_plus_1[seq(3*16+1,16*4)]),
c(freq_words_plus_1[seq(4*16+1,16*5)]),
c(freq_words_plus_1[seq(5*16+1,16*6)]),
c(freq_words_plus_1[seq(6*16+1,16*7)]),
c(freq_words_plus_1[seq(7*16+1,16*8)]),
c(freq_words_plus_1[seq(8*16+1,16*9)]),
c(freq_words_plus_1[seq(9*16+1,16*10)]),
c(freq_words_plus_1[seq(10*16+1,16*11)]),
c(freq_words_plus_1[seq(11*16+1,16*12)]),
c(freq_words_plus_1[seq(12*16+1,16*13)])
)
# datatable(freq_words_dt, rownames = FALSE, colnames = FALSE, filter="none", options = list(pageLength = 13, scrollX=T) )
kable(freq_words_dt, digits = 3, col.names = NULL) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec (1:13,border_left = T, border_right = T) %>%
row_spec(1:16, extra_css = "border-bottom: 1px solid")| actual | can | differ | found | instal | long | much | onli | problem | right | site | test | want |
| address | case | doe | free | interest | look | must | order | process | rpm | softwar | thank | way |
| also | chang | doesn | get | internet | lot | name | origin | product | run | someth | thing | web |
| american | check | don | give | invest | made | nation | packag | program | said | spam | think | week |
| ani | click | good | issu | need | part | provid | say | spamassassin | time | well | ||
| anoth | code | end | got | just | make | net | peopl | put | secur | sponsor | today | whi |
| anyon | com | error | govern | keep | manag | network | perl | rate | see | start | trade | will |
| back | come | even | great | know | mani | never | person | razor | seem | state | tri | window |
| base | compani | everi | group | last | market | new | phone | read | send | still | two | within |
| becaus | comput | exmh | help | life | may | next | place | real | sent | subject | type | without |
| befor | countri | file | high | like | mean | now | pleas | realli | sep | support | unit | work |
| best | current | find | home | line | messag | numbertnumber | point | receiv | server | sure | unsubscrib | world |
| better | data | first | hyperlink | link | might | offer | post | releas | servic | system | use | write |
| build | date | follow | idea | linux | million | old | power | remov | set | take | user | wrote |
| busi | day | fork | includ | list | money | onc | price | report | show | talk | veri | year |
| call | develop | form | inform | live | month | one | probabl | requir | sinc | technolog | version |
# I got all of this code from stackoverflow by doing some simple googling for how to display a confusion matrix
## Source https://stackoverflow.com/questions/23891140/r-how-to-visualize-confusion-matrix-using-the-caret-package
draw_confusion_matrix <- function(cm) {
total <- sum(cm$table)
res <- as.numeric(cm$table)
# Generate color gradients. Palettes come from RColorBrewer.
greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
getColor <- function (greenOrRed = "green", amount = 0) {
if (amount == 0)
return("#FFFFFF")
palette <- greenPalette
if (greenOrRed == "red")
palette <- redPalette
colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
}
# set the basic layout
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Confusion Matrix', cex.main=2)
# create the matrix
classes = colnames(cm$table)
rect(150, 430, 240, 370, col=getColor("green", res[1]))
text(195, 435, "Non-Spam", cex=1.2)
rect(250, 430, 340, 370, col=getColor("red", res[3]))
text(295, 435, "Spam", cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col=getColor("red", res[2]))
rect(250, 305, 340, 365, col=getColor("green", res[4]))
text(140, 400, "Non-Spam", cex=1.2, srt=90)
text(140, 335, "Spam", cex=1.2, srt=90)
# add in the cm results
text(195, 400, res[1], cex=1.6, font=2, col='black')
text(195, 335, res[2], cex=1.6, font=2, col='black')
text(295, 400, res[3], cex=1.6, font=2, col='black')
text(295, 335, res[4], cex=1.6, font=2, col='black')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "Metrics", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
} # Splitting for 5-fold cross-validation
folds <- email_train %>% vfold_cv(v = 5)
# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining the recipe
data_recipe <- recipe(
email_train_label ~ .,
data = email_train
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes())
# Defining Lasso Model
lasso <- linear_reg(
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( lasso )
# Parallelize
doParallel::registerDoParallel(cores = 4)
# CV
lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
grid = lambdas,
metrics = metric_set(rmse, mae)
)## Warning: package 'rlang' was built under R version 3.6.2
## Warning: package 'vctrs' was built under R version 3.6.2
## Warning: package 'glmnet' was built under R version 3.6.2
## Warning: package 'Matrix' was built under R version 3.6.2
# Find best models ( source: juliasilge.com/blog/lasso-the-office/ )
lasso_cv %>% collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_errorbar(aes(
ymin = mean - std_err,
ymax = mean + std_err
),
alpha = 0.5
) +
geom_line(size = 1.5) +
facet_wrap(~.metric, scales = "free", nrow = 2) +
theme(legend.position = "none") + theme_base() +
# xlim(0, 1000) +
scale_x_log10()best_lasso_mae = lasso_cv %>% show_best(metric = "mae") %>% filter(penalty == min(penalty))
best_lasso_rmse = lasso_cv %>% show_best(metric = "rmse") %>% filter(penalty == min(penalty))
best_lasso = rbind(best_lasso_mae, best_lasso_rmse)
names(best_lasso) = c("Penalty", "Metric", "Estimator", "Mean", "n", "Standard Error", ".config")
kable(best_lasso, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Penalty | Metric | Estimator | Mean | n | Standard Error | .config |
|---|---|---|---|---|---|---|
| 0.010 | mae | standard | 0.283 | 5 | 0.002 | Preprocessor1_Model002 |
| 0.031 | rmse | standard | 0.376 | 5 | 0.004 | Preprocessor1_Model009 |
# Get test accuracy
best_lasso_workflow = workflow_lasso %>%
finalize_workflow(select_best(lasso_cv, metric = "mae")) %>%
fit(data = email_train)
best_lasso = best_lasso_workflow %>% extract_fit_parsnip()
## Clean test data
email_test_clean = recipe(
email_test_label ~ .,
data = email_test
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
prep() %>% juice()
## Make predictions
lasso_predictions = predict(best_lasso, email_test_clean)
lasso_predictions = lasso_predictions %>% mutate(prediction = ifelse(.pred < 0.5, 0, 1))
email_test_clean$predictions = lasso_predictions$prediction
## Calculate accuracy
email_test_clean = email_test_clean %>% mutate(accurate = ifelse(predictions == email_test_label, 1, 0))
acc = sum(email_test_clean$accurate) / nrow(email_test_clean)
# print(acc)# Set seed
set.seed(9753)
# converting outcome variable into character vector
email_train <- email_train %>%
mutate(
outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
)
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
# step_num2factor( email_train_label ) %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining Lasso Model
log_lasso <- logistic_reg(
mode = 'classification',
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( log_lasso )
# Parallelize
# doParallel::registerDoParallel(cores = 4)
# CV
log_lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
metrics = metric_set(accuracy, roc_auc, sens, spec, precision),
grid = grid_latin_hypercube(penalty(), size = 5),
# grid = grid_latin_hypercube(penalty() , mixture(), size = 20),
control = control_grid(parallel_over = 'resamples')
)# Find best models ( source: juliasilge.com/blog/lasso-the-office/ )
# log_lasso_cv %>% collect_metrics() %>% arrange(mean) %>%
# ggplot(aes(penalty, mean, color = .metric)) +
# # geom_line(size = 1.5) +
# # geom_errorbar(aes(
# # ymin = mean - std_err,
# # ymax = mean + std_err
# # ),
# # alpha = 0.5
# # ) +
# geom_smooth(aes(penalty, mean, color = .metric)
# , size = 1
# , alpha = 0.5
# , method = "lm") +
# # facet_wrap(~.metric, scales = "free", nrow = 3) +
# # scale_x_log10() +
# # scale_y_log10() +
# theme(legend.position = "none") + theme_base() #+ xlim(0, 0.1)
log_lasso_cv_results = log_lasso_cv %>% collect_metrics() %>% group_by(.metric) %>% summarize(mean_accuracy = mean(mean, na.rm = T))
log_lasso_cv_results[1] = c("Accuracy",
"Precision",
"Area Under the Curve",
"Sensitivity",
"Specificity")
names(log_lasso_cv_results) = c("Metric", "Mean")
kable(log_lasso_cv_results, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Metric | Mean |
|---|---|
| Accuracy | 0.809 |
| Precision | 0.831 |
| Area Under the Curve | 0.508 |
| Sensitivity | 0.967 |
| Specificity | 0.040 |
# set.seed(9754)
set.seed(9753)
# converting outcome variable into character vector
email_train <- email_train %>%
mutate(
outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
)
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# small sample of data
# sample <- email_train[, ]
# Defining the recipe
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
# step_num2factor( email_train_label ) %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Define the model
model_logistic <- logistic_reg(
mode = 'classification') %>% # Simple LogReg bc no penalty
set_engine('glm')
# Workflow
workflow_logistic <- workflow() %>%
add_recipe(data_recipe) %>%
add_model(model_logistic)
# Cross-Validation
cv_logistic <- workflow_logistic %>%
fit_resamples(
resamples = folds,
metrics = metric_set(accuracy, roc_auc, sens, spec, precision)
)# Visualizing output
log_reg_results = cv_logistic %>% collect_metrics() %>%
select(.metric, mean)
log_reg_results[1] = c("Accuracy",
"Precision",
"Area Under the Curve",
"Sensitivity",
"Specificity")
names(log_reg_results) = c("Metric", "Mean")
kable(log_reg_results, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Metric | Mean |
|---|---|
| Accuracy | 0.798 |
| Precision | 0.832 |
| Area Under the Curve | 0.511 |
| Sensitivity | 0.947 |
| Specificity | 0.071 |
random_forest <- train(
x = email_train_rf,
y = email_train_label_rf,
method = "ranger",
num.trees = 200,
importance = "impurity",
trControl = trainControl(method = "cv",
number = 3,
verboseIter = TRUE
)
)## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = extratrees, min.node.size = 1 on full training set
# Checking "variable importances
top_25words = varImp(random_forest, scale = TRUE)$importance %>%
rownames_to_column() %>%
arrange(-Overall) %>%
top_n(25) ## Selecting by Overall
# ggplot(data = top_25words) +
# aes(x = reorder(rowname, Overall), y = Overall) +
# geom_col(stat = "identify") +
# labs(title = "Most Predictive Words") +
# coord_flip()
ggplot(data = top_25words,
aes(x=reorder(rowname, Overall),
y = Overall)) +
geom_bar(stat = "identity") +
theme_base() +
theme(axis.text.x=element_text(angle=50, hjust=1))+
xlab("Top 25 Predictive Words(stemed)")+
ylab("Frequency of Word") +
labs(title = "Most Predictive Words") +
theme(plot.title = element_text(hjust = 0.5))# Redeclaring `email_test`
email_test <- apply( email_dtm_freq_test, MARGIN = 2,
convert_values )
email_test <- cbind( email_test,
email_test_label )
# Fitting to the test data
predictions <- predict(random_forest, email_test)
random_forest_results = confusionMatrix(predictions, email_test_label)
draw_confusion_matrix(random_forest_results)# Creating df with different metrics and results
comparing_acc_table = data.frame(
c(
"Naive Bayes",
"Lasso",
"Logistic Lasso",
"Logistic",
"Random Forest"
),
c(
naive_bayes_results[["overall"]][["Accuracy"]],
acc,
log_lasso_cv_results$Mean[1],
log_reg_results$Mean[1],
random_forest_results[["overall"]][["Accuracy"]]
),
c(
naive_bayes_results[["byClass"]][["Sensitivity"]],
0,
log_lasso_cv_results$Mean[4],
log_reg_results$Mean[4],
random_forest_results[["byClass"]][["Sensitivity"]]
),
c(
naive_bayes_results[["byClass"]][["Specificity"]],
1,
log_lasso_cv_results$Mean[5],
log_reg_results$Mean[5],
random_forest_results[["byClass"]][["Specificity"]]
)
)
names(comparing_acc_table) = c("Method", "Accuracy", "Sensitivity", "Specificity")
kable(comparing_acc_table, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Method | Accuracy | Sensitivity | Specificity |
|---|---|---|---|
| Naive Bayes | 0.788 | 0.044 | 0.921 |
| Lasso | 0.848 | 0.000 | 1.000 |
| Logistic Lasso | 0.809 | 0.967 | 0.040 |
| Logistic | 0.798 | 0.947 | 0.071 |
| Random Forest | 0.847 | 0.998 | 0.000 |